########################################################
## PROGRAM NAME: 002_fin_zoning.R                     ##
## AUTHOR: MATT MLECZKO                               ##
## INPUTS:                                            ##
##    001_wrld_2006.Rda                               ##
##    001_nzlu_2022.Rda                               ##
##    001_nllus_2003.Rda                              ##
##                                                    ##
## OUTPUTS:                                           ##
##    002_wrld_nllus_place_2006.Rda                   ##
##    002_wrld_nllus_msasample_2006.Rda               ##
##    002_wrld_nllus_msa_2006.Rda                     ##
##    002_nzlu_muni_2022.Rda                          ##
##    002_nzlu_msasample_2022.Rda                     ##
##    002_nzlu_wmsas_2022.Rda                         ##
##    002_nzlu_msa_2022.Rda                           ##
##    002_tract_to_cs.Rda                             ##
##    002_tract_to_pl.Rda                             ##
##                                                    ##
## PURPOSE: Create place and MSA-level files          ##
##                                                    ##
## LIST OF UPDATES:                                   ##
########################################################

#log <- file(# USER DEFINED PATH AND FILE NAME HERE #)
#sink(log, append=TRUE)
#sink(log, append=TRUE, type="message")

## load libraries ## 

library("haven")
library("foreign")
library("dplyr")
library("tidyr")
library("stringr")
library("readxl")
library("writexl")
library("gdata")
library("tm")
library("tm")
library("gsubfn")

`%notin%` <- Negate(`%in%`)

## create a merge function that creates merge frequency as in Stata ##
## from user rwbuie at stackoverflow: https://stackoverflow.com/questions/30358401/is-there-a-way-to-create-statas-merge-indicator-variable-with-rs-merge ##
stata.merge <- function(x,y, name){
  x$df1 <- 1
  y$df2 <- 2
  df <- merge(x,y, by = name, all = TRUE)
  df$merge.variable <- rowSums(df[,c("df1", "df2")], na.rm=TRUE)
  df$df1 <- NULL
  df$df2<- NULL
  df
  #print(table(df$merge.variable))
  
  ## return the merged dataframe
  return(df)
}


## define paths
input_path <- # USER DEFINED PATH HERE #
output_path <- # USER DEFINED PATH HERE #

## set working directory
setwd(output_path)

## read-in data ## 
load("001_nzlu_2022.Rda")
load("001_wrld_2006.Rda")
load("001_nllus_2003.Rda")

###############
## FUNCTIONS ##
###############

## create ZRI ##

create.zri <- function(in.data){

  ## PCA ## 

  pca.data <- in.data %>%
    ungroup %>%
    select(sindex1, sindex2, sindex3, sindex4, sindex5, sindex6, sindex7) %>%
    drop_na()

  pca.res <- prcomp(pca.data, center= T, scale=T)
  print("PCA Results")
  print(summary(pca.res))
  loadings <- pca.res$rotation
  print("Loadings")
  print(loadings)

  ## final index ## 

  in.data$zri <- in.data$sindex1*loadings[1,1] + 
    in.data$sindex2*loadings[2,1] + 
    in.data$sindex3*loadings[3,1] + 
    in.data$sindex4*loadings[4,1] + 
    in.data$sindex5*loadings[5,1] + 
    in.data$sindex6*loadings[6,1] + 
    in.data$sindex7*loadings[7,1]

  ## standardized final index ## 

  in.data$zri_st <- (in.data$zri - mean(in.data$zri, na.rm=T))/sd(in.data$zri,na.rm=T)


  ## correlations of final index with subindices ## 

  print("Correlation between zri_st and sindex1")
  print(cor(in.data$sindex1,
            in.data$zri_st, use="complete.obs"))

  print("Correlation between zri_st and sindex2")
  print(cor(in.data$sindex2,
            in.data$zri_st, use="complete.obs"))

  print("Correlation between zri_st and sindex3")
  print(cor(in.data$sindex3,
            in.data$zri_st, use="complete.obs"))

  print("Correlation between zri_st and sindex4")
  print(cor(in.data$sindex4,
            in.data$zri_st, use="complete.obs"))

  print("Correlation between zri_st and sindex5")
  print(cor(in.data$sindex5,
            in.data$zri_st, use="complete.obs"))

  print("Correlation between zri_st and sindex6")
  print(cor(in.data$sindex6,
            in.data$zri_st, use="complete.obs"))
  
  print("Correlation between zri_st and sindex7")
  print(cor(in.data$sindex7,
            in.data$zri_st, use="complete.obs"))

  ## simple additive index ## 

  in.data$add_index <- rowSums(in.data[,c("sindex1",
                                          "sindex2",
                                          "sindex3",
                                          "sindex4",
                                          "sindex5",
                                          "sindex6",
                                          "sindex7")], na.rm=TRUE)

  print("Correlation between zri_st and add_index")
  print(cor(in.data$zri_st, 
            in.data$add_index,
            use="complete.obs"))

  ## density plot of final index ##

  print("Density plot of zri_st")
  plot(density(in.data$zri_st, na.rm=T))
  
  return(in.data)
}

## creat tract to muni crosswalks ##

## first, places ##

tract.to.pl.in <- readr::read_csv(paste0(input_path,
                                         "tract_to_place.csv"))

## clean up errors in the tract ID ##

tract.to.pl.rf <- tract.to.pl.in %>%
  mutate(tract.rf = str_replace_all(tract, '\\.', ''),
         fintract = str_pad(tract.rf, 6, "right", "0"))

range(nchar(trim(tract.to.pl.rf$fintract)))

## clean up errors in the tract ID ##

tract.to.pl.rf <- tract.to.pl.in %>%
  mutate(tract.rf = str_replace_all(tract, '\\.', ''),
         fintract = str_pad(tract.rf, 6, "right", "0"))

## fix FIPS code issues ##
tract.to.pl.rf$placefp[tract.to.pl.rf$county == "25011" & tract.to.pl.rf$placefp == "27100"] <- "27060"
tract.to.pl.rf$placefp[tract.to.pl.rf$county == "39113" & tract.to.pl.rf$placefp == "83090"] <- "83111"
tract.to.pl.rf$placefp[tract.to.pl.rf$county %in% c("47001","47013") & tract.to.pl.rf$placefp == "40240"] <- "64668"
tract.to.pl.rf$placefp[tract.to.pl.rf$county == "12099" & tract.to.pl.rf$placefp == "39075"] <- "39081"
tract.to.pl.rf$placefp[tract.to.pl.rf$county == "25021" & tract.to.pl.rf$placefp == "25172"] <- "25100"

range(nchar(trim(tract.to.pl.rf$fintract)))

tract.to.pl <- tract.to.pl.rf %>%
  filter(placenm != "Place name") %>%
  mutate(FIPS = str_pad(county, 5, "left", "0"),
         GEOID = paste(FIPS,fintract,sep="")) %>%
  select(GEOID,
         FIPS,
         afact,
         placefp,
         placenm) %>%
  mutate(afact_num = as.numeric(afact))

range(nchar(trim(tract.to.pl$GEOID)))
range(nchar(trim(tract.to.pl$FIPS)))

## limit to tracts in metro areas ##

load(paste0(output_path,
            "001_msa_del.Rda"))

range(nchar(trim(msa.del.2020.rd$FIPS)))
length(unique(msa.del.2020.rd$FIPS)) == nrow(msa.del.2020.rd)

tract.to.pl.metro.m <- stata.merge(tract.to.pl,
                                   msa.del.2020.rd,
                                   "FIPS")

## check merge ##
table(tract.to.pl.metro.m$merge.variable)

tract.to.pl.metro <- tract.to.pl.metro.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) %>%
  mutate(GEOID_muni = paste0(`FIPS State Code`,
                             placefp)) %>%
  ## drop Framingham, MA because it is captured in cousub file
  filter(GEOID_muni != "2524960")

range(nchar(trim(tract.to.pl.metro$GEOID)))
range(nchar(trim(tract.to.pl.metro$placefp)))
range(nchar(trim(tract.to.pl.metro$GEOID_muni)))

## fix FIPS code issues ##
tract.to.pl.metro$GEOID_muni[tract.to.pl.metro$GEOID_muni == "2527100"] <- "2527060"
tract.to.pl.metro$placefp[tract.to.pl.metro$GEOID_muni == "2527060"] <- "27060"

tract.to.pl.metro$GEOID_muni[tract.to.pl.metro$GEOID_muni == "3983090"] <- "3983111"
tract.to.pl.metro$placefp[tract.to.pl.metro$GEOID_muni == "3983111"] <- "83111"

tract.to.pl.metro$GEOID_muni[tract.to.pl.metro$GEOID_muni == "4740240"] <- "4764668"
tract.to.pl.metro$placefp[tract.to.pl.metro$GEOID_muni == "4764668"] <- "64668"

tract.to.pl.metro$GEOID_muni[tract.to.pl.metro$GEOID_muni == "1239075"] <- "1239081"
tract.to.pl.metro$placefp[tract.to.pl.metro$GEOID_muni == "1239081"] <- "39081"

tract.to.pl.metro$GEOID_muni[tract.to.pl.metro$GEOID_muni == "1571550"] <- "1517000"
tract.to.pl.metro$placefp[tract.to.pl.metro$GEOID_muni == "1517000"] <- "17000"

save(tract.to.pl.metro,
     file = paste0(output_path,
                   "002_tract_to_pl.Rda"))

pl.metro <- tract.to.pl.metro %>%
  filter(!grepl("CDP", placenm) | GEOID_muni %in% c("1271625",
                                                    "1232400",
                                                    "2412150",
                                                    "2446725",
                                                    "1522700",
                                                    "5364365",
                                                    "1517000")) %>%
  select(GEOID_muni,
         cbsa10,
         `CBSA Title`) %>%
  unique() %>%
  rename(GEOID = GEOID_muni,
         cbsaname10 =  `CBSA Title`)

## now, cosubs ##

tract.to.cs.in <- readr::read_csv(paste0(input_path,
                                         "tract_to_cousub.csv"))

## clean up errors in the tract ID ##

tract.to.cs.rf <- tract.to.cs.in %>%
  mutate(tract.rf = str_replace_all(tract, '\\.', ''),
         fintract = str_pad(tract.rf, 6, "right", "0"))

range(nchar(trim(tract.to.cs.rf$fintract)))

## clean up errors in the tract ID ##

tract.to.cs.rf <- tract.to.cs.in %>%
  mutate(tract.rf = str_replace_all(tract, '\\.', ''),
         fintract = str_pad(tract.rf, 6, "right", "0"))

range(nchar(trim(tract.to.cs.rf$fintract)))

tract.to.cs <- tract.to.cs.rf %>%
  filter(cousubnm != "County subdivision name") %>%
  mutate(FIPS = str_pad(county, 5, "left", "0"),
         GEOID = paste(FIPS,fintract,sep="")) %>%
  select(GEOID,
         FIPS,
         afact,
         cousubfp,
         cousubnm) %>%
  mutate(afact_num = as.numeric(afact))

tract.to.cs$cousubnm <- iconv(tract.to.cs$cousubnm , from = "latin1", to = "UTF-8")


## fix FIPS code issues ##
tract.to.cs$cousubfp[tract.to.cs$FIPS == "25005" & tract.to.cs$cousubfp == "46575"] <- "46598"
tract.to.cs$cousubfp[tract.to.cs$FIPS == "25017" & tract.to.cs$cousubfp == "24925"] <- "24960"
tract.to.cs$cousubfp[tract.to.cs$FIPS == "25021" & tract.to.cs$cousubfp == "78972"] <- "78865"
tract.to.cs$cousubfp[tract.to.cs$FIPS == "25025" & tract.to.cs$cousubfp == "81005"] <- "80930"


range(nchar(trim(tract.to.cs$GEOID)))
range(nchar(trim(tract.to.cs$FIPS)))

range(nchar(trim(msa.del.2020.rd$FIPS)))
length(unique(msa.del.2020.rd$FIPS)) == nrow(msa.del.2020.rd)


## limit to tracts in metro areas ##

tract.to.cs.metro.m <- stata.merge(tract.to.cs,
                                   msa.del.2020.rd,
                                   "FIPS")

## check merge ##
table(tract.to.cs.metro.m$merge.variable)

tract.to.cs.metro <- tract.to.cs.metro.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) %>%
  mutate(GEOID_muni = paste0(`FIPS State Code`,
                             cousubfp),
         GEOID_muni_full = paste0(`FIPS State Code`,
                                  `FIPS County Code`,
                                  cousubfp))

tract.to.cs.metro$GEOID_muni_full[tract.to.cs.metro$GEOID_muni_full == "2500546575"] <- "2500546598"
tract.to.cs.metro$GEOID_muni[tract.to.cs.metro$GEOID_muni_full == "2500546598"] <- "2546598"
tract.to.cs.metro$cousubfp[tract.to.cs.metro$GEOID_muni_full == "2500546598"] <- "46598"

range(nchar(trim(tract.to.cs.metro$GEOID)))
range(nchar(trim(tract.to.cs.metro$cousubfp)))

save(tract.to.cs.metro,
     file = paste0(output_path,
                   "002_tract_to_cs.Rda"))

cs.metro <- tract.to.cs.metro %>%
  select(-GEOID) %>%
  filter(!grepl("CCD", cousubnm)) %>%
  mutate(GEOID = paste0(`FIPS State Code`,
                        cousubfp),
         GEOID_full = paste0(`FIPS State Code`,
                             `FIPS County Code`,
                             cousubfp)) %>%
  select(GEOID,
         GEOID_full,
         cbsa10,
         `CBSA Title`) %>%
  unique() %>%
  rename(cbsaname10 =  `CBSA Title`)


####################################
## Merge WRLD 2006 and NLLUS 2003 ##
####################################

## merge checks ##

nrow(wrld.2006.final) == length(unique(wrld.2006.final$GEOID))
class(wrld.2006.final$GEOID)
range(nchar(trim(wrld.2006.final$GEOID)))

nllus.2003.fm <- nllus.2003.final %>%
  select(-statename,
         -name)

nrow(nllus.2003.fm) == length(unique(nllus.2003.fm$GEOID))
class(nllus.2003.fm$GEOID)
range(nchar(trim(nllus.2003.fm$GEOID)))

wrld.nllus.merged <- stata.merge(wrld.2006.final,
                                 nllus.2003.fm,
                                 "GEOID")

## diagnose merge ##
table(wrld.nllus.merged$merge.variable)


## create file, keeping matches 1 and 3 ##
wrld.nllus.cb <- wrld.nllus.merged %>%
  filter(merge.variable %in% c(1,3)) %>%
  select(-merge.variable)

wrld.nllus.cb$sindex1 <- rowSums(wrld.nllus.cb[,c("restrict_sf_permit", 
                                                  "restrict_mf_permit",
                                                  "limit_sf_units",
                                                  "limit_mf_units",
                                                  "limit_mf_dwellings",
                                                  "limit_mf_dwelling_units")], na.rm=TRUE)

wrld.nllus.cb$sindex2 <- wrld.nllus.cb$open_space

wrld.nllus.cb <- wrld.nllus.cb %>%
  mutate(sindex3 = case_when(two_acre_more == 1 ~ 4,
                             one_acre_more == 1 & two_acre_more == 0 ~ 3,
                             half_acre_more == 1 & two_acre_more == 0 & one_acre_more == 0 ~ 2, 
                             half_acre_less == 1 & two_acre_more == 0 & one_acre_more == 0 & half_acre_more == 0 ~ 1)) 

wrld.nllus.cb$sindex4 <- wrld.nllus.cb$total_nz

wrld.nllus.cb$sindex5 <- wrld.nllus.cb$total_rz

wrld.nllus.cb <- wrld.nllus.cb %>%
  mutate(sindex6 = case_when(maxden5 == 1 ~ 1,
                             maxden4 == 1 ~ 2,
                             maxden3 == 1 ~ 3,
                             maxden2 == 1 ~ 4,
                             maxden1 == 1 ~ 5))

wrld.nllus.cb$sindex7 <- wrld.nllus.cb$inclusionary


## create zri ##
wrld.nllus.2006.final <- create.zri(wrld.nllus.cb)

## output file ##

save(wrld.nllus.2006.final,
     file = paste(output_path,
                  "002_wrld_nllus_place_2006.Rda",
                  sep=""))

################################
## assign 2006 places to MSAs ##
################################

## merge checks ## 
nrow(wrld.nllus.2006.final) == length(unique(wrld.nllus.2006.final$GEOID))
class(wrld.nllus.2006.final$GEOID)
range(nchar(trim(wrld.nllus.2006.final$GEOID)))

nrow(pl.metro) == length(unique(pl.metro$GEOID))
class(pl.metro$GEOID)
range(nchar(trim(pl.metro$GEOID)))

## merge data frames ## 
wrld.nllus.2006.msa.merge <- stata.merge(wrld.nllus.2006.final,
                                         pl.metro,
                                         "GEOID")

## check merge ## 
table(wrld.nllus.2006.msa.merge$merge.variable)

## output non-matches eligible for match with county subs ##
no.msa.wrld.nllus.2006 <- wrld.nllus.2006.msa.merge %>%
  filter(merge.variable ==1) %>%
  select(-cbsa10,
         -cbsaname10,
         -merge.variable)

## keep matches ## 
wrld.nllus.msa.2006.keep1 <- wrld.nllus.2006.msa.merge %>%
  filter(merge.variable ==3) %>%
  select(-merge.variable) %>%
  mutate(GEOID_full = GEOID)

## check for dups ## 
## these are places that span multiple MSAs ##
wrld.nllus.msa.dupcheck1 <- wrld.nllus.msa.2006.keep1 %>%
  group_by(GEOID) %>%
  summarize(n=n(), .groups = "drop") %>%
  filter(n>1)

## now, county subs ## 

## merge checks ##
nrow(cs.metro) == length(unique(cs.metro$GEOID))
class(cs.metro$GEOID)
range(nchar(trim(cs.metro$GEOID)))

nrow(no.msa.wrld.nllus.2006) == length(unique(no.msa.wrld.nllus.2006$GEOID))
class(no.msa.wrld.nllus.2006$GEOID)
range(nchar(trim(no.msa.wrld.nllus.2006$GEOID)))

## merge data frames ## 
wrld.nllus.2006.cs.merge <- stata.merge(no.msa.wrld.nllus.2006,
                                        cs.metro,
                                        "GEOID")

## check merge ##
table(wrld.nllus.2006.cs.merge$merge.variable)

## keep matches ## 
wrld.nllus.msa.2006.keep2 <- wrld.nllus.2006.cs.merge %>%
  filter(merge.variable ==3) %>%
  select(-merge.variable)

## check for dups ## 
## these are places that span multiple MSAs ##
wrld.nllus.msa.dupcheck2 <- wrld.nllus.msa.2006.keep2 %>%
  group_by(GEOID) %>%
  summarize(n=n(), .groups = "drop") %>%
  filter(n>1)

## append the two matched dataframes ##

wrld.nllus.2006.wmsas <- rbind(wrld.nllus.msa.2006.keep1,
                               wrld.nllus.msa.2006.keep2)


## export these munis for later ##
wrld.nllus.2006.wmsas.out <- wrld.nllus.2006.wmsas %>%
  select(GEOID)

save(wrld.nllus.2006.wmsas.out,
     file = paste(output_path,
                  "002_wrld_nllus_msasample_2006.Rda",
                  sep=""))

## create MSA level file ## 

wrld.nllus.msa.2006.temp <- wrld.nllus.2006.wmsas %>%
  group_by(cbsa10) %>%
  summarize(cbsaname10 = cbsaname10[which(cbsaname10 != "")[1]],
            responses = n(),
            restrict_sf_permit = mean(restrict_sf_permit, na.rm=T),
            restrict_mf_permit = mean(restrict_mf_permit, na.rm=T),
            limit_sf_units = mean(limit_sf_units, na.rm=T),
            limit_mf_units = mean(limit_mf_units, na.rm=T),
            limit_mf_dwellings = mean(limit_mf_dwellings, na.rm=T),
            limit_mf_dwelling_units = mean(limit_mf_dwelling_units, na.rm=T),
            min_lot_size = mean(min_lot_size, na.rm=T),
            open_space = mean(open_space, na.rm=T),
            inclusionary = mean(inclusionary, na.rm=T),
            half_acre_less = mean(half_acre_less, na.rm=T),
            half_acre_more = mean(half_acre_more, na.rm=T),
            one_acre_more = mean(one_acre_more, na.rm=T),
            two_acre_more = mean(two_acre_more, na.rm=T),
            total_nz = mean(total_nz, na.rm=T),
            total_rz = mean(total_rz, na.rm=T),
            maxden5 = mean(maxden5, na.rm=T),
            maxden4 = mean(maxden4, na.rm=T),
            maxden3 = mean(maxden3, na.rm=T),
            maxden2 = mean(maxden2, na.rm=T),
            maxden1 = mean(maxden1, na.rm=T),
            zri_median = median(zri, na.rm=T),
            zri_range = abs(max(zri, na.rm=T) - min(zri,na.rm=T)), .groups = "drop")

## re-standardize final index ## 

wrld.nllus.msa.2006.temp$zri_median_st <- (wrld.nllus.msa.2006.temp$zri_median - mean(wrld.nllus.msa.2006.temp$zri_median, na.rm=T))/sd(wrld.nllus.msa.2006.temp$zri_median,na.rm=T)
wrld.nllus.msa.2006.temp$zri_range_st <- (wrld.nllus.msa.2006.temp$zri_range - mean(wrld.nllus.msa.2006.temp$zri_range, na.rm=T))/sd(wrld.nllus.msa.2006.temp$zri_range,na.rm=T)

## output file ##

save(wrld.nllus.msa.2006.temp,
     file = paste(output_path,
                  "002_wrld_nllus_msa_2006.Rda",
                  sep=""))

#####################
## NZLU index file ##
#####################

## change values for munis that are verified as incorrect ##

## Barre, MA ## 
nzlu.2022.final$restrict_sf_permit[nzlu.2022.final$GEOID == "2503740"] <- 1
nzlu.2022.final$restrict_mf_permit[nzlu.2022.final$GEOID == "2503740"] <- 1
nzlu.2022.final$limit_sf_units[nzlu.2022.final$GEOID == "2503740"] <- 1
nzlu.2022.final$limit_mf_units[nzlu.2022.final$GEOID == "2503740"] <- 1
nzlu.2022.final$limit_mf_dwellings[nzlu.2022.final$GEOID == "2503740"] <- 1

## Orlando, FL ## 
nzlu.2022.final$restrict_sf_permit[nzlu.2022.final$GEOID == "1253000"] <- 0
nzlu.2022.final$restrict_mf_permit[nzlu.2022.final$GEOID == "1253000"] <- 0
nzlu.2022.final$limit_sf_units[nzlu.2022.final$GEOID == "1253000"] <- 0
nzlu.2022.final$limit_mf_units[nzlu.2022.final$GEOID == "1253000"] <- 0
nzlu.2022.final$limit_mf_dwellings[nzlu.2022.final$GEOID == "1253000"] <- 0
nzlu.2022.final$limit_mf_dwelling_units[nzlu.2022.final$GEOID == "1253000"] <- 0

## Winston Salem, NC ## 
nzlu.2022.final$restrict_sf_permit[nzlu.2022.final$GEOID == "3775000"] <- 0
nzlu.2022.final$restrict_mf_permit[nzlu.2022.final$GEOID == "3775000"] <- 0
nzlu.2022.final$limit_sf_units[nzlu.2022.final$GEOID == "3775000"] <- 0
nzlu.2022.final$limit_mf_units[nzlu.2022.final$GEOID == "3775000"] <- 0
nzlu.2022.final$limit_mf_dwellings[nzlu.2022.final$GEOID == "3775000"] <- 0
nzlu.2022.final$limit_mf_dwelling_units[nzlu.2022.final$GEOID == "3775000"] <- 0

## Palm Beach Gardens, FL ##
nzlu.2022.final$restrict_sf_permit[nzlu.2022.final$GEOID == "1254075"] <- 0
nzlu.2022.final$restrict_mf_permit[nzlu.2022.final$GEOID == "1254075"] <- 0
nzlu.2022.final$limit_sf_units[nzlu.2022.final$GEOID == "1254075"] <- 0
nzlu.2022.final$limit_mf_units[nzlu.2022.final$GEOID == "1254075"] <- 0

## carry on with processing ##

nzlu.2022.final$sindex1 <- rowSums(nzlu.2022.final[,c("restrict_sf_permit", 
                                                      "restrict_mf_permit",
                                                      "limit_sf_units",
                                                      "limit_mf_units",
                                                      "limit_mf_dwellings",
                                                      "limit_mf_dwelling_units")], na.rm=TRUE)

nzlu.2022.final$sindex2 <- nzlu.2022.final$open_space

nzlu.2022.final <- nzlu.2022.final %>%
  mutate(sindex3 = case_when(two_acre_more == 1 ~ 4,
                             one_acre_more == 1 & two_acre_more == 0 ~ 3,
                             half_acre_more == 1 & two_acre_more == 0 & one_acre_more == 0 ~ 2, 
                             half_acre_less == 1 & two_acre_more == 0 & one_acre_more == 0 & half_acre_more == 0 ~ 1)) 

nzlu.2022.final$sindex4 <- nzlu.2022.final$total_nz

nzlu.2022.final$sindex5 <- nzlu.2022.final$total_rz

nzlu.2022.final <- nzlu.2022.final %>%
  mutate(sindex6 = case_when(maxden5 == 1 ~ 1,
                             maxden4 == 1 ~ 2,
                             maxden3 == 1 ~ 3,
                             maxden2 == 1 ~ 4,
                             maxden1 == 1 ~ 5))

nzlu.2022.final$sindex7 <- nzlu.2022.final$inclusionary

## CA fixes ##
nzlu.2022.final$sindex3[nzlu.2022.final$GEOID == "0614736"] <- 2
nzlu.2022.final$sindex3[nzlu.2022.final$GEOID == "0620956"] <- 1

## TX fixes ##

nzlu.2022.final$sindex3[nzlu.2022.final$GEOID == "4803144"] <- 1
nzlu.2022.final$sindex3[nzlu.2022.final$GEOID == "4806060"] <- 1
nzlu.2022.final$sindex3[nzlu.2022.final$GEOID == "4811300"] <- 1
nzlu.2022.final$sindex3[nzlu.2022.final$GEOID == "4820140"] <- 1
nzlu.2022.final$sindex3[nzlu.2022.final$GEOID == "4833068"] <- 1
nzlu.2022.final$sindex3[nzlu.2022.final$GEOID == "4834502"] <- 1
nzlu.2022.final$sindex3[nzlu.2022.final$GEOID == "4836092"] <- 1
nzlu.2022.final$sindex3[nzlu.2022.final$GEOID == "4849128"] <- 1
nzlu.2022.final$sindex3[nzlu.2022.final$GEOID == "4855008"] <- 1
nzlu.2022.final$sindex3[nzlu.2022.final$GEOID == "4860164"] <- 1
nzlu.2022.final$sindex3[nzlu.2022.final$GEOID == "4863044"] <- 1
nzlu.2022.final$sindex3[nzlu.2022.final$GEOID == "4867964"] <- 3
nzlu.2022.final$sindex3[nzlu.2022.final$GEOID == "4871384"] <- 1
nzlu.2022.final$sindex3[nzlu.2022.final$GEOID == "4872989"] <- 1
nzlu.2022.final$sindex3[nzlu.2022.final$GEOID == "4876228"] <- 1
nzlu.2022.final$sindex3[nzlu.2022.final$GEOID == "4876948"] <- 1
nzlu.2022.final$sindex3[nzlu.2022.final$GEOID == "4877416"] <- 1


## create zri ##

nzlu.2022.final.zri <- create.zri(nzlu.2022.final)

## checks ## 
sum(is.na(nzlu.2022.final.zri$zri))

## fix GEOIDs ##
nzlu.2022.final.zri$GEOID[nzlu.2022.final.zri$GEOID == "1571550"] <- "1517000"
nzlu.2022.final.zri$GEOID[nzlu.2022.final.zri$GEOID == "1983055"] <- "1983145"
nzlu.2022.final.zri$GEOID[nzlu.2022.final.zri$GEOID == "2578972"] <- "2578865"
nzlu.2022.final.zri$GEOID[nzlu.2022.final.zri$GEOID == "2527100"] <- "2527060"
nzlu.2022.final.zri$GEOID[nzlu.2022.final.zri$GEOID == "2365725"] <- "2365760"


## output file ##

save(nzlu.2022.final.zri,
     file = paste(output_path,
                  "002_nzlu_muni_2022.Rda",
                  sep=""))

#######################################
## assign source 2022 places to MSAs ##
#######################################

## merge checks ## 
nrow(nzlu.2022.final.zri) == length(unique(nzlu.2022.final.zri$GEOID))
class(nzlu.2022.final.zri$GEOID)
range(nchar(trim(nzlu.2022.final.zri$GEOID)))
sum(nchar(nzlu.2022.final.zri$GEOID) < 7)

nrow(pl.metro) == length(unique(pl.metro$GEOID))
class(pl.metro$GEOID)
range(nchar(trim(pl.metro$GEOID)))

## merge data frames ## 
nzlu.2022.msa.merge <- stata.merge(nzlu.2022.final.zri,
                                   pl.metro,
                                   "GEOID")

## check merge ## 
table(nzlu.2022.msa.merge$merge.variable)

## output non-matches eligible for match with county subs ##
no.msa.nzlu.2022 <- nzlu.2022.msa.merge %>%
  filter(merge.variable ==1) %>%
  select(-merge.variable,
         -cbsa10,
         -cbsaname10)

## keep matches ## 
nzlu.2022.msa.keep1 <- nzlu.2022.msa.merge %>%
  filter(merge.variable ==3) %>%
  select(-merge.variable) %>%
  mutate(GEOID_full = GEOID)

## check for dups ## 
## these are places that span multiple MSAs ##
nzlu.2022.dupcheck1 <- nzlu.2022.msa.keep1 %>%
  group_by(GEOID) %>%
  summarize(n=n(), .groups = "drop") %>%
  filter(n>1)

## now, county subs ## 

## merge checks ##
nrow(cs.metro) == length(unique(cs.metro$GEOID))
class(cs.metro$GEOID)
range(nchar(trim(cs.metro$GEOID)))

nrow(no.msa.nzlu.2022) == length(unique(no.msa.nzlu.2022$GEOID))
class(no.msa.nzlu.2022$GEOID)
range(nchar(trim(no.msa.nzlu.2022$GEOID)))

## merge data frames ## 
nzlu.2022.cs.merge <- stata.merge(no.msa.nzlu.2022,
                                  cs.metro,
                                  "GEOID")

## check merge ##
table(nzlu.2022.cs.merge$merge.variable)

## keep matches ## 
nzlu.2022.msa.keep2 <- nzlu.2022.cs.merge %>%
  filter(merge.variable ==3) %>%
  select(-merge.variable)

## check for dups ## 
## these are places that span multiple MSAs ##
nzlu.2022.dupcheck2 <- nzlu.2022.msa.keep2 %>%
  group_by(GEOID) %>%
  summarize(n=n(), .groups = "drop") %>%
  filter(n>1)

## append the two matched dataframes ##

nzlu.2022.wmsas <- rbind(nzlu.2022.msa.keep1,
                         nzlu.2022.msa.keep2)

nzlu.2022.wmsas$state <- nzlu.2022.wmsas$statename

## export these munis for later ##

save(nzlu.2022.wmsas,
     file = paste(output_path,
                  "002_nzlu_wmsas_2022.Rda",
                  sep=""))



nzlu.2022.wmsas.out <- nzlu.2022.wmsas %>%
  select(GEOID)

save(nzlu.2022.wmsas.out,
     file = paste(output_path,
                  "002_nzlu_msasample_2022.Rda",
                  sep=""))

## create MSA level file ## 

nzlu.msa.2022.pt1 <- nzlu.2022.wmsas %>%
  group_by(cbsa10) %>%
  summarize(cbsaname10 = cbsaname10[which(cbsaname10 != "")[1]],
            responses = n(),
            restrict_sf_permit = mean(restrict_sf_permit, na.rm=T),
            restrict_mf_permit = mean(restrict_mf_permit, na.rm=T),
            limit_sf_units = mean(limit_sf_units, na.rm=T),
            limit_mf_units = mean(limit_mf_units, na.rm=T),
            limit_mf_dwellings = mean(limit_mf_dwellings, na.rm=T),
            limit_mf_dwelling_units = mean(limit_mf_dwelling_units, na.rm=T),
            min_lot_size = mean(min_lot_size, na.rm=T),
            open_space = mean(open_space, na.rm=T),
            inclusionary = mean(inclusionary, na.rm=T),
            half_acre_less = mean(half_acre_less, na.rm=T),
            half_acre_more = mean(half_acre_more, na.rm=T),
            one_acre_more = mean(one_acre_more, na.rm=T),
            two_acre_more = mean(two_acre_more, na.rm=T),
            total_nz = mean(total_nz, na.rm=T),
            total_rz = mean(total_rz, na.rm=T),
            maxden5 = mean(maxden5, na.rm=T),
            maxden4 = mean(maxden4, na.rm=T),
            maxden3 = mean(maxden3, na.rm=T),
            maxden2 = mean(maxden2, na.rm=T),
            maxden1 = mean(maxden1, na.rm=T),
            adu = mean(adu, na.rm=T),
            height_ft_median = mean(height_ft_median, na.rm=T),
            height_ft_mode = mean(height_ft_mode, na.rm=T),
            height_st_median = mean(height_st_median, na.rm=T),
            height_st_mode = mean(height_st_mode,na.rm=T),
            parking_median = mean(parking_median,na.rm=T),
            parking_mode = mean(parking_mode,na.rm=T),
            zri_median = median(zri, na.rm=T),
            zri_range = abs(max(zri, na.rm=T) - min(zri,na.rm=T)), .groups = "drop") %>%
  filter(!is.na(zri_median)) 

## re-standardize final index ## 

nzlu.msa.2022.pt1$zri_median_st <- (nzlu.msa.2022.pt1$zri_median - mean(nzlu.msa.2022.pt1$zri_median, na.rm=T))/sd(nzlu.msa.2022.pt1$zri_median,na.rm=T)
nzlu.msa.2022.pt1$zri_range_st <- (nzlu.msa.2022.pt1$zri_range - mean(nzlu.msa.2022.pt1$zri_range, na.rm=T))/sd(nzlu.msa.2022.pt1$zri_range,na.rm=T)

nzlu.msa.2022.pt1look <- nzlu.msa.2022.pt1 %>%
  select(cbsa10,
         cbsaname10,
         zri_median,
         zri_median_st,
         zri_range,
         zri_range_st)

nzlu.msas.2022 <- nzlu.2022.wmsas %>%
  select(GEOID,
         statename,
         place,
         cbsa10,
         cbsaname10,
         zri,
         zri_st) 

## now, let's create the second msa level ZRI dimension ##

## merge on central cities ##

load("ccs_final.Rda")

## fix Honolulu GEOID ##
ccs.final$GEOID[ccs.final$GEOID == "1571550"] <- "1517000"

nzlu.msa.2022.pt2.m <- stata.merge(nzlu.2022.wmsas,
                                   ccs.final,
                                   "GEOID")

## check merge ##
table(nzlu.msa.2022.pt2.m$merge.variable, useNA = "ifany")

## keep all obs, create cc indicator ## 

nzlu.msa.2022.pt2.temp <- nzlu.msa.2022.pt2.m %>%
  filter(merge.variable %in% c(1,3)) %>%
  mutate(cc = ifelse(merge.variable == 3, 1, 0)) %>%
  select(-merge.variable)

nzlu.msa.2022.pt2a <- nzlu.msa.2022.pt2.temp %>%
  filter(cc == 1) %>%
  group_by(cbsa10) %>%
  summarize(cbsaname10 = first(cbsaname10),
            cc_zri = median(zri,na.rm=T), .groups = "drop") 

nzlu.msa.2022.pt2b <- nzlu.msa.2022.pt2.temp %>%
  filter(cc == 0 & !is.na(zri)) %>%
  group_by(cbsa10) %>%
  summarize(cc_zri_max = max(zri,na.rm=T), .groups = "drop") 

print("ROWS PRIOR TO NZLU MSA 2022 PT 2")
nrow(nzlu.msa.2022.pt2a)
nrow(nzlu.msa.2022.pt2b)

nzlu.msa.2022.pt2 <- nzlu.msa.2022.pt2a %>%
  left_join(nzlu.msa.2022.pt2b, "cbsa10") %>%
  select(cbsa10,
         cc_zri,
         cc_zri_max) %>%
  mutate(zri_diff = cc_zri_max - cc_zri,
         zri_abs_diff = abs(cc_zri_max-cc_zri))

print("ROWS OF NZLU MSA 2022 PT1")
nrow(nzlu.msa.2022.pt1)
print("ROWS OF NZLU MSA 2022 PT2")
nrow(nzlu.msa.2022.pt2)

nzlu.msa.2022.final <- nzlu.msa.2022.pt1 %>%
  left_join(nzlu.msa.2022.pt2, "cbsa10") %>%
  mutate(zri_full = zri_median + zri_diff)

print("FINAL MSA ROWS")
nrow(nzlu.msa.2022.final)

nzlu.msa.2022.final$zri_full_st <- (nzlu.msa.2022.final$zri_full - mean(nzlu.msa.2022.final$zri_full, na.rm=T))/sd(nzlu.msa.2022.final$zri_full,na.rm=T)

## check ##

nzlu.msa.2022.finlook <- nzlu.msa.2022.final %>%
  select(cbsa10,
         cbsaname10,
         zri_median,
         zri_diff,
         zri_full,
         zri_full_st)

## output file ##

save(nzlu.msa.2022.final,
     file = paste(output_path,
                  "002_nzlu_msa_2022.Rda",
                  sep=""))


## END OF PROGRAM ##


#sink()
